In [2]:
# Import stuff
%matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from scipy.stats import pearsonr
from __future__ import division
import matplotlib.pyplot as plt
from statsmodels.nonparametric.smoothers_lowess import lowess
In [4]:
# Location of the raw training data.
# NOTE(review): absolute, machine-specific path - consider a path relative to the repository.
RAW_TRAIN_CSV = '/Users/jesskerlin/Documents/GitHub/digblood/data/raw/blood_train.csv'

# DataFrame.from_csv is deprecated (and removed in pandas 1.0). read_csv with
# index_col=0 and parse_dates=True reproduces from_csv's defaults.
df = pd.read_csv(RAW_TRAIN_CSV, index_col=0, parse_dates=True)

# Replace spaces in the column names with underscores so the columns can be
# named directly inside a statsmodels formula string.
df.columns = [c.replace(' ', '_') for c in df.columns]
In [5]:
# Preview the first five rows of the data set.
df.head()
Out[5]:
In [6]:
# Pearson correlation (coefficient, p-value) between the number of donations
# and the total volume donated.
pearsonr(
    df['Number_of_Donations'],
    df['Total_Volume_Donated_(c.c.)'],
)
Out[6]:
In [7]:
# Mean of the March 2007 outcome for each value of months since last donation.
# `data` now holds the grouped frame itself; previously it was bound to the
# matplotlib Axes returned by .plot(), which the name did not reflect.
data = (
    df[['Months_since_Last_Donation', 'Made_Donation_in_March_2007']]
    .groupby(['Months_since_Last_Donation'])
    .mean()
)

# Bar chart of the per-group means, with labelled axes so the figure stands alone.
ax = data.plot(kind = 'bar')
ax.set_xlabel('Months since last donation')
ax.set_ylabel('Mean of Made_Donation_in_March_2007');
In [8]:
# From http://stackoverflow.com/questions/18517722/weighted-moving-average-in-python
def weighted_moving_average(x,y,step_size=0.05,width=1):
    """Gaussian-weighted moving average of ``y`` evaluated along ``x``.

    Bin centers start half a step above ``min(x)`` and advance by
    ``step_size``, stopping before ``max(x)``. The value reported for each
    center is the average of *all* ``y`` samples, weighted by a Gaussian of
    standard deviation ``width`` centred on that bin.

    Parameters
    ----------
    x, y : arrays of sample positions and sample values (same length).
    step_size : spacing between consecutive bin centers.
    width : standard deviation of the Gaussian weighting kernel.

    Returns
    -------
    (bin_centers, bin_avg) : tuple of two arrays of equal length.
    """
    half_step = 0.5*step_size
    bin_centers = np.arange(np.min(x),np.max(x)-half_step,step_size)+half_step
    bin_avg = np.zeros(len(bin_centers))
    for i, center in enumerate(bin_centers):
        # Unnormalised Gaussian kernel evaluated at every sample position;
        # np.average takes care of normalising by the sum of the weights.
        weights = np.exp(-(x-center)**2/(2*width**2))
        bin_avg[i] = np.average(y,weights=weights)
    return (bin_centers,bin_avg)
# Per-group statistics of the March 2007 outcome, keyed by months since first donation.
first_donation_groups = df[['Months_since_First_Donation','Made_Donation_in_March_2007']].groupby(['Months_since_First_Donation'])
data = first_donation_groups.mean()
# This was a second .mean(), which made `count` a duplicate of `data`;
# .count() gives the number of rows in each group instead.
count = first_donation_groups.count()

# Order the rows by months since last donation. Note that this rebinds `df`,
# so later cells see the sorted frame.
df = df.sort_values('Months_since_Last_Donation')
x = df['Months_since_Last_Donation'].values #.apply(lambda x: np.log(x)).
y = df['Made_Donation_in_March_2007'].values

# Two statements were removed here because each raised NameError on a fresh
# kernel: a bare `scipy.stats.halfnorm` (scipy itself is never imported and the
# expression had no effect) and a print of the undefined name `smoothed`.

# Gaussian-smoothed outcome as a function of months since last donation.
x_out,y_out = weighted_moving_average(x,y,step_size = 1,width = 5)
plt.plot(x_out,y_out)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(x)
#plt.bar(data.index,data.Made_Donation_in_March_2007)
In [9]:
# Show the array of months since last donation.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(x)
Since Total Volume Donated adds no information beyond Number of Donations (the two columns compared in the Pearson correlation above), I won't include it as a feature.
In [10]:
# DataFrame.drop returns a new frame rather than modifying `df`, so the
# result must be kept; the original call discarded it and was a no-op.
# The frame without the volume column is stored under its own name so `df`
# is left untouched for the other cells.
model_df = df.drop('Total_Volume_Donated_(c.c.)', axis = 1)

# OLS regression of the March 2007 outcome on the three remaining predictors.
model = smf.ols('Made_Donation_in_March_2007 ~ Months_since_Last_Donation + Number_of_Donations + Months_since_First_Donation', data = model_df)
result = model.fit()

# Only the last expression of a cell is displayed, so the summary is printed
# explicitly; the fitted values remain the cell's output.
print(result.summary())
result.fittedvalues
Out[10]:
In [11]:
# List the column labels of the frame.
df.columns
Out[11]:
In [14]:
# Constant baseline: every row is assigned the overall mean of the target.
mean = df['Made_Donation_in_March_2007'].mean()
# Assigning a scalar broadcasts it to every row, so no hard-coded row count
# (previously 576) or 2-D helper array is needed.
df['Means'] = mean
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(df['Means'])
In [ ]:
In [16]:
# Training evaluation: log-loss of the constant 'Means' baseline against the
# observed March 2007 outcome.
from sklearn.metrics import log_loss
pred = np.array(df['Means'])
actual = df['Made_Donation_in_March_2007']
# Parenthesised single-argument print works on both Python 2 and Python 3;
# the emitted text is unchanged.
print('Training log-loss score ' + str(log_loss(actual,pred)))
In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for the frame's numeric columns.
df.describe()
Out[18]: